Aim

To assess the quality of whole transcriptomic Iso-Seq runs using ERCC as benchmark
- Number of reads mapped to ERCC (difference between WT and TG?)
- Correlation of ERCC known concentration and FL Reads per ERCC detected
- Features of those ERCCs not detected
- Additional: saturation of detecting ERCCs

Chain: Number of Reads

Pipeline

This was performed by merging only WT (n = 6), only TG (n = 6) and all the samples (n = 12) together at Isoseq3 Cluster, align to ERCC, cupcake collapse and chain three datasets, followed by SQANTI.

Pipeline for analysis

Pipeline for analysis

Classification file

Redundant Isoforms from Chained Data

Plot is drawn from All_Merged dataset in Sqanti chained output

All: Number of Reads

Pipeline

Pipeline for analysis

Pipeline for analysis

Classification file

Redundant Isoforms

Demultiplex: Number of Reads

This was performed by merging only WT (n = 6), only TG (n = 6) and all the samples (n = 12) together at Isoseq3 Cluster, align to ERCC, cupcake collapse, demultiplex, followed by SQANTI.
Note: only 10 samples had ERCCs

Pipeline

Pipeline for analysis

Pipeline for analysis

Classification file

Unique ERCC

## Total unique ERCCs in both WT and TG: 37 (40.22%)
##   Phenotype Mean_ERCC_Detected  perc
## 1        TG               32.2 35.00
## 2        WT               32.4 35.22

Redundant Isoforms

## Number of ERCC with more than one isoform 8 ( 8.7 %)

+ TAMA

To use TAMA’s tama_remove_fragment_models.py to remove shorter redundant “isoforms” from the same ERCCs. Script is applied after SQANTI filtering.

TAMA <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Whole_Transcriptome/All_Samples/ERCC/TOFU/TAMA_Filter/"
tama_retained <- read.table(paste0(TAMA,"All_Merged_postsqanti.bed")) %>% mutate(TAMA = "Retained") 
tama_discard <- read.table(paste0(TAMA,"All_Merged_postsqanti_discarded.txt")) %>% mutate(TAMA = "Discarded")
tama <- bind_rows(tama_retained, tama_discard) %>% mutate(isoform = word(.$V4, c(2),sep = ";"))

p9 <- tama %>% group_by(V1,TAMA) %>% tally() %>%
  ggplot(., aes(x = reorder(V1,-n), y = n, fill = TAMA)) + geom_bar(stat = "identity") + 
  labs(y = "Number of Isoforms", x = "", title = "\n\n") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        legend.title = element_blank(), legend.position = c(0.9, 0.9)) +
  mytheme 

# reads filtered from TAMA 
# total FL reads across 10 samples associated with isoform 
p10 <- class[class$chrom %in% tama_discard$V1,] %>%
  mutate(Total_FL = .[, 46:55] %>% rowSums(na.rm = TRUE)) %>%
  left_join(., tama, by = ("isoform")) %>% 
  mutate(log10_Total_FL = log10(Total_FL)) %>%
  ggplot(., aes(x = chrom, y = log10_Total_FL, colour = TAMA)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  mytheme + labs(y = "Full-Length Reads (Log10)", x = "", title = "\n\n") 

p11 <- class %>%
  mutate(Total_FL = .[, 46:55] %>% rowSums(na.rm = TRUE)) %>%
  left_join(., tama, by = ("isoform")) %>% 
  mutate(log10_Total_FL = log10(Total_FL)) %>%
  ggplot(., aes(x = chrom, y = log10_Total_FL, colour = TAMA)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  mytheme + labs(y = "Full-Length Reads (Log10)", x = "") 

p9 

Correlation

## 
##  Pearson's product-moment correlation
## 
## data:  dat[[x.var]] and dat[[y.var]]
## t = 8.5057, df = 35, p-value = 4.89e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6770173 0.9043953
## sample estimates:
##       cor 
## 0.8209479 
## 
## [1] ""
## [1] "Correlation betweenlog2_amount_of_ERCCandlog2_FL_reads"
## [1] "corr.value0.820947870699425"
## [1] "p.value4.89015691673981e-10"

Mapping: Filtered ERCCs

Number of reads mapped

Merged

All_Samples_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Whole_Transcriptome/All_Samples/"

# ERCC_mapped, ERCC_unmapped, Mouse all, Mouse mapped, mouse unmapped
Merged_mapping <- list(
  ## ERCC
  read.table(paste0(All_Samples_dir,"/ERCC/MAPPING/All_Merged_reads_with_alignment_statistics.txt")),
  read.table(paste0(All_Samples_dir, "/ERCC/MAPPING/All_Merged.notread.clustered.hq.fastq.paf"), as.is = T, sep = "\t") %>%
    mutate(transcript = word(V1, c(1), sep = fixed(" "))),
  
  ## Mouse 
  read.table(paste0(All_Samples_dir,"/MOUSE/MAPPING/PAF/All_Merged.allread.clustered.hq.fastq.paf"), as.is = T, sep = "\t") %>%
    mutate(transcript = word(V1, c(1), sep = fixed(" "))),  
  read.table(paste0(All_Samples_dir, "/MOUSE/MAPPING/PAF/All_Merged_reads_with_alignment_statistics.txt"), as.is = T, sep = "\t") %>%
    mutate(transcript = word(V1, c(1), sep = fixed(" "))),
  read.table(paste0(All_Samples_dir,"/MOUSE/MAPPING/PAF/All_Merged.notread.clustered.hq.fastq.paf"), as.is = T, sep = "\t")
) 

names(Merged_mapping) <- c("ERCC_mapped","ERCC_unmapped","Mouse_all","Mouse_mapped","Mouse_unmapped")

#total_num
# number of total FL reads being mapped
# number of mapped ERCC reads  
# number of mapped mouse reads
# number of unmapped reads

total_num <- data.frame("total_fl" = nrow(Merged_mapping$Mouse_all),
                        "ERCC_mapped" = nrow(Merged_mapping$ERCC_mapped),
                        "Mouse_mapped" = nrow(Merged_mapping$Mouse_mapped),
                        "Mouse_unmapped" = length(setdiff(Merged_mapping$Mouse_unmapped$V1,Merged_mapping$ERCC_mapped$V1)))

total_num <- total_num %>% melt() %>% mutate(perc = value/total_num$total_fl * 100) %>%
  filter(variable != "total_fl") %>% 
  mutate(sample = "Merged") %>% 
  mutate(phenotype = "Merged") %>% 
  mutate(all_reads = total_num$total_fl) %>%
  select(sample, phenotype, variable, value, all_reads) %>%
  `colnames<-`(c("sample", "phenotype","type", "total_reads", "all_reads")) %>% 
  mutate(perc = total_reads/all_reads * 100) %>%
  mutate(dataset = "merged")

Individual

############# Individual
Individual_Mouse_Mapping_dir = "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Whole_Transcriptome/Individual/MAPPING"
Individual_ERCC_Mapping_dir = "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Whole_Transcriptome/Individual/ERCC_MAPPING"

Individual_Mapping <- list(
  # ERCC 
  # do not include Q21 and O18 as did not use ERCCs
  lapply(grep(list.files(path = Individual_ERCC_Mapping_dir, pattern = "reads_with_alignment_statistics.txt", full.names = T),
                     pattern=c("Q21|O18"), invert=TRUE, value=TRUE),
       function(x) read.table(x,as.is = T, sep = "\t") %>% 
         mutate(transcript = word(V1, c(1), sep = fixed(" ")))),
  
  # Mouse
  lapply(list.files(path = Individual_Mouse_Mapping_dir, pattern = "allread.clustered.hq.fastq.paf", full.names = T),
       function(x) read.table(x,as.is = T, sep = "\t") %>% 
         mutate(transcript = word(V1, c(1), sep = fixed(" ")))),
  lapply(list.files(path = Individual_Mouse_Mapping_dir, pattern = "reads_with_alignment_statistics.txt", full.names = T),
       function(x) read.table(x,as.is = T, sep = "\t") %>% mutate(transcript = word(V1, c(1), sep = fixed(" ")))),
  lapply(list.files(path = Individual_Mouse_Mapping_dir, pattern = "notread.clustered.hq.fastq.paf", full.names = T),
       function(x) read.table(x,as.is = T, sep = "\t"))
)
names(Individual_Mapping) <- c("ERCC_mapped","all","mapped","unmapped")
# the sample name input into each list is the same order (alphabetical)
names(Individual_Mapping$ERCC_mapped) <- grep(list.files(path = Individual_ERCC_Mapping_dir, pattern = "reads_with_alignment_statistics.txt"),pattern=c("Q21|O18"), invert=TRUE, value=TRUE)
names(Individual_Mapping$all) <- list.files(path = Individual_Mouse_Mapping_dir, pattern = "allread.clustered.hq.fastq.paf")
names(Individual_Mapping$mapped) <- list.files(path = Individual_Mouse_Mapping_dir, pattern = "reads_with_alignment_statistics.txt")
names(Individual_Mapping$unmapped) <- list.files(path = Individual_Mouse_Mapping_dir, pattern = "notread.clustered.hq.fastq.paf")

individual_total_num <- bind_rows(
  ldply(Individual_Mapping$ERCC_mapped, function(x) nrow(x)) %>% mutate(sample = word(.id, c(1), sep = fixed("_"))) %>% 
         mutate(type = "ERCC_mapped"), 
  ldply(Individual_Mapping$mapped, function(x) nrow(x)) %>% mutate(sample = word(.id, c(1), sep = fixed("_"))) %>%
         mutate(type = "Mouse_mapped"),
  ldply(Individual_Mapping$unmapped, function(x) nrow(x)) %>% mutate(sample = word(.id, c(1), sep = fixed("."))) %>%
         mutate(type = "Mouse_unmapped")
)

individual_total_reads <- ldply(Individual_Mapping$all, function(x) nrow(x)) %>% mutate(sample = word(.id, c(1), sep = fixed(".")))
individual_total_num <- individual_total_num %>% 
  mutate(phenotype = ifelse(.$sample %in% c("O18","K18","S18","L22","Q20","K24"), "TG","WT")) %>%
  left_join(., individual_total_reads, by = "sample") %>%
  select(sample, phenotype, type, V1.x, V1.y) %>%
  `colnames<-`(c("sample", "phenotype","type", "total_reads", "all_reads")) %>%
  mutate(perc = total_reads/all_reads * 100) %>% 
  mutate(dataset = "individual")


all_num <- bind_rows(individual_total_num, total_num)

plot_mapping <- function(dat){
  p <- ggplot(dat, aes(x = phenotype, y = perc, fill = phenotype)) + geom_boxplot() +
    geom_dotplot(binaxis='y', stackdir='center', position=position_dodge(0.7), dotsize = 0.5) + 
    mytheme + labs(x = "", y = "Full-Length clustered reads (%)", title = "\n\n") +
    scale_fill_manual(values = c(label_colour("TG"),label_colour("WT"),label_colour("Merged"))) + 
    theme(legend.position = "none",
          strip.background = element_blank(),
          strip.text.x = element_blank()) +
    facet_grid(~dataset,scales = "free_x")
  
  return(p)
}

plot_mapping(all_num %>% filter(type == "Mouse_mapped")) 

Post Filtering

## Total unique ERCCs in both WT and TG (post filtering on length and identity): 57 (61.96%)

Discarded ERCC

Rationale for discarded ERCC (1 isoform) from cupcake collapse
Cupcake’s parameters: minimum alignment coverage = 0.99, minimum alignment identity = 0.95

Readjustment of Demultiplex

Unique ERCC

Reduced cupcake’s collapse default parameters (0.99) to 0.95

## Total unique ERCCs in both WT and TG: 57 (61.96%)

Redundant Isoforms

+ TAMA

To use TAMA’s tama_remove_fragment_models.py to remove shorter redundant “isoforms” from the same ERCCs. Script is applied after SQANTI filtering.

TAMA <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Whole_Transcriptome/All_Samples/ERCC/TOFU_ADJUST/TAMA_Filter/"
tama_retained_adjusted <- read.table(paste0(TAMA,"All_Merged_postsqanti.bed")) %>% mutate(TAMA = "Retain") 
tama_discard <- read.table(paste0(TAMA,"All_Merged_postsqanti_discarded.txt")) %>% mutate(TAMA = "Discard")
tama <- bind_rows(tama_retained_adjusted, tama_discard) %>% mutate(isoform = word(.$V4, c(2),sep = ";"))

p19 <- tama %>% group_by(V1,TAMA) %>% tally() %>%
  ggplot(., aes(x = reorder(V1,-n), y = n, fill = TAMA)) + geom_bar(stat = "identity") + labs(y = "Number of Isoforms", x = "ERCC") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  mytheme

# reads filtered from TAMA 
# total FL reads across 10 samples associated with isoform 
p20 <- class[class$chrom %in% tama_discard$V1,] %>%
  mutate(Total_FL = .[, 46:55] %>% rowSums(na.rm = TRUE)) %>%
  left_join(., tama, by = ("isoform")) %>% 
  mutate(log10_Total_FL = log10(Total_FL)) %>%
  ggplot(., aes(x = chrom, y = log10_Total_FL, colour = TAMA)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  mytheme + labs(y = "FL Reads (Log10)", x = "ERCC") 

p11 <- class %>%
  mutate(Total_FL = .[, 46:55] %>% rowSums(na.rm = TRUE)) %>%
  left_join(., tama, by = ("isoform")) %>% 
  mutate(log10_Total_FL = log10(Total_FL)) %>%
  ggplot(., aes(x = chrom, y = log10_Total_FL, colour = TAMA)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  mytheme + labs(y = "FL Reads (Log10)", x = "ERCC") 

p19 

Correlation

## 
##  Pearson's product-moment correlation
## 
## data:  dat[[x.var]] and dat[[y.var]]
## t = 38.688, df = 55, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9697060 0.9894717
## sample estimates:
##      cor 
## 0.982118 
## 
## [1] ""
## [1] "Correlation betweenlog2_amount_of_ERCCandlog2_FL_reads"
## [1] "corr.value0.982117953981347"
## [1] "p.value1.41213399302975e-41"

## png 
##   2